* Start from an empty session and load the pooled IPUMS USA extract
* (1980, 1990, 2000 census + 2001-2019 ACS).
* The global macro "path" is not defined in this script and must be set beforehand.
* Forward slashes are used as directory separators because Stata accepts them on
* every operating system, whereas backslashes only resolve on Windows.
clear all
use "$path/Raw_Data/ipums_usa.dta"

* Native flag: detailed birthplace codes below 10000 are labelled as US-born.
generate byte native = (bpld < 10000)
label variable native "Born in the US"

* Survey years are pooled, so build a stratum id that is unique within year.
egen new_strata = group(year strata)

* Sampling fraction by survey year: 5% through 2000, a year-specific 1-in-N
* rate for 2001-2004, and 1% from 2005 onward.
generate sampling_rate = 0.05 if year <= 2000
local acs_yrs    2001 2002 2003 2004
local acs_denoms 232 261 236 239
forvalues k = 1/4 {
	local yr : word `k' of `acs_yrs'
	local dn : word `k' of `acs_denoms'
	replace sampling_rate = 1/`dn' if year == `yr'
}
replace sampling_rate = 1/100 if year >= 2005
	
* Person-weight variants for the pooled samples. Each variant starts as a copy of
* perwt and is then rescaled, year by year, by
*     (observations in that year) / (observations in the pooled sample).
* Years that are not part of a given pooled sample keep the unscaled perwt.
gen perwt_main = perwt
gen perwt_2011 = perwt
gen perwt_full = perwt
gen perwt_excl_1990 = perwt

*for main analysis, including censuses + 2001-2011 ACS
count if year <= 2011
local tot_obs_1980_2011 = r(N)
levelsof year if year<=2011, local(years)
foreach y of local years {
	count if year == `y'
	* keep the count in a local: r(N) written inside a quoted string is printed literally
	local n_y = r(N)
	display "year `y' has `n_y' obs"
	replace perwt_main = perwt * `n_y' / `tot_obs_1980_2011' if year == `y'
}

*create a 5% cohort from 2009-2013 for Fig 3
count if year>=2009 & year<=2013
local tot_obs_2009_2013 = r(N)
levelsof year if year>=2009 & year<=2013, local(years)
foreach y of local years {
	count if year == `y'
	local n_y = r(N)
	display "year `y' has `n_y' obs"
	replace perwt_2011 = perwt * `n_y' / `tot_obs_2009_2013' if year == `y'
}

*for assimilation analysis, including censuses + 2001-2019 ACS
count
local tot_obs_1980_2019 = r(N)
levelsof year, local(years)
foreach y of local years {
	count if year == `y'
	local n_y = r(N)
	display "year `y' has `n_y' obs"
	replace perwt_full = perwt * `n_y' / `tot_obs_1980_2019' if year == `y'
}

*excluding 1990 census
count if year != 1990
local tot_obs_excl_1990 = r(N)
levelsof year if year != 1990, local(years)
foreach y of local years {
	count if year == `y'
	local n_y = r(N)
	display "year `y' has `n_y' obs"
	replace perwt_excl_1990 = perwt * `n_y' / `tot_obs_excl_1990' if year == `y'
}
	

* Randomly subsample natives within survey year.
* A native row is dropped when its uniform draw exceeds 0.1, so about 10% of natives
* are kept in expectation; non-natives are never dropped.
* The seed is fixed so the draw sequence is repeatable.
* NOTE(review): which rows receive which draw also depends on the row order within
* year after the by-sort -- confirm that order is stable if exact replication matters.
set seed 123456789
bys year: drop if native == 1 & runiform()>0.1

* Scale every weight variant by 10 for natives to offset the 10% subsample above.
foreach var in perwt_main perwt_2011 perwt_full perwt_excl_1990{
	replace `var' = `var' * 10 if native == 1
}	


*****************************
* country of birth variables

* Sat and Sov flag two sets of birthplace codes; EB is their union.
* NOTE(review): presumably Soviet satellite states and the former USSR -- confirm
* the code lists against the IPUMS BPL codebook.
gen byte Sat = inlist(bpl, 451, 452, 454, 455, 456)
gen byte Sov = inlist(bpl, 460, 461, 462, 463, 465)
gen byte EB = (Sov == 1 | Sat == 1)

* Western + Northern + Southern (excl. Albania) Europe. Excludes Central/Eastern Europe including Yugoslavia, Austria, Germany.
gen west = inrange(bpl, 400, 429) | inrange(bpl, 431, 440)
gen Imm = (west == 1 | EB == 1)

* Narrower western group, excluding countries that joined EU/EEA between 1974 and 2003:
* Greece, Portugal, Spain, Austria, Finland, Sweden, Iceland, Liechtenstein, Norway
gen west2 = inlist(bpl,400,403,410,411,412,413,414,419,420,421,423,424,425,426,429,431,432,434,435,437,439,440)
gen Imm2 = (west2 == 1 | EB == 1)

***************************************************************************************
* Demographics

* Age computed as survey year minus birth year (fixed across survey years for a given birth cohort)
gen age_alt = year - birthyr

rename yrimmig immigration_year

* Point estimate of the arrival year. In the 1980 and 1990 data the recorded value is
* an interval code, which is mapped to the value listed after the colon; later
* surveys use the recorded year directly. Rows matched by no rule keep the value 0.
gen immigration_year_approx = 0

* interval codes remapped for the 1990 census only (code:assigned value)
foreach rule in 1987:1988.5 1985:1985.5 1982:1983 1980:1980.5 {
	gettoken code rest : rule, parse(":")
	gettoken sep mid : rest, parse(":")
	replace immigration_year_approx = `mid' if year == 1990 & immigration_year == `code'
}

* interval codes remapped for both the 1980 and 1990 census; 1949 is the bottom code
foreach rule in 1975:1977 1970:1972 1965:1967 1960:1962 1950:1954.5 1949:1949 {
	gettoken code rest : rule, parse(":")
	gettoken sep mid : rest, parse(":")
	replace immigration_year_approx = `mid' if inlist(year, 1980, 1990) & immigration_year == `code'
}

* precise years are available after 1990
replace immigration_year_approx = immigration_year if year > 1990

label variable immigration_year_approx "midpoint of year of immigration range"

* Age at immigration: age minus approximate years since arrival, stored as an integer.
* Negative values are floored at 0 and natives receive the sentinel value 900.
gen int immigration_age = age_alt - (year - immigration_year_approx)
replace immigration_age = cond(native == 1, 900, 0) if native == 1 | immigration_age < 0
label variable immigration_age "best estimate of age at immigration"
notes immigration_age: Natives are coded as 900

*** approximate years in the US (natives again coded 900)
gen years_in_us = cond(native == 1, 900, year - immigration_year_approx)
label variable years_in_us "Years in US"

* Indicator variables; the raw source columns are dropped once recoded.
gen byte female = (sex == 2)
gen byte married = inlist(marst, 1, 2, 3)
drop sex marst

***************************************************************************************
* outcome variables

*** employment status: 1 when empstat equals 1, 0 otherwise
generate byte employed = (empstat == 1)
label variable employed "employed"

***  income
* total income net of welfare and social security income, then scaled by cpi99 (inflation adjustment)
* NOTE(review): no special handling of N/A top codes in the income fields is done here -- confirm they cannot occur in the analysis sample.
generate income = inctot - incwelfr - incss
replace income = income * cpi99
label variable income "total personal income excl. welfare and social sec."

* natural log; non-positive incomes become missing
generate log_income = ln(income)
label variable log_income "log earned income"

drop empstat
drop inctot incwelfr incss cpi99

**occupation
*collapse classification to match Germany
*https://www.ilo.org/wcmsp5/groups/public/---dgreports/---dcomm/---publ/documents/publication/wcms_172572.pdf
* https://en.wikipedia.org/wiki/International_Standard_Classification_of_Occupations

* The occupation coding scheme differs by survey year, so the cut-offs for the
* "high" and "high or medium" groups are set separately for each scheme.

* 1980 and 1990 census (same cut-offs in both years)
* https://international.ipums.org/international-action/variables/US1980A_0452#codes_section
*https://international.ipums.org/international-action/variables/US1990A_0451#codes_section
gen high_occ = inrange(occ, 1, 199) if inlist(year, 1980, 1990)
gen high_med_occ = inrange(occ, 1, 859) if inlist(year, 1980, 1990)

* 2000 census
*https://international.ipums.org/international-action/variables/US2000A_0449#codes_section
replace high_occ = inrange(occ, 1, 365) if year == 2000
replace high_med_occ = inrange(occ, 1, 960) if year == 2000

* 2001-2019 ACS
* https://usa.ipums.org/usa/volii/occ_acs.shtml
* NOTE(review): confirm that occ in the earliest ACS years is on the same 4-digit scale as these cut-offs.
replace high_occ = inrange(occ, 10, 3655) if inrange(year, 2001, 2019)
replace high_med_occ = inrange(occ, 1, 9600) if inrange(year, 2001, 2019)

*US census asks about occupation in recent 5-10 years even among currently unemployed or inactive. code these as 0 to match Germany / Israel
foreach occ_flag in high_occ high_med_occ {
	replace `occ_flag' = 0 if employed == 0
}

drop occ

** education -- ISCED97 recode for consistency across countries
* Every educd value not covered by a range below keeps the base value 0.
generate isced97 = 0
replace isced97 = 2  if inrange(educd, 23, 26)   // level 1 -- grade 6
replace isced97 = 4  if inrange(educd, 30, 50)   // level 2 -- grade 9
replace isced97 = 6  if inrange(educd, 60, 64)   // level 3 -- grade 12 or GED
replace isced97 = 7  if inrange(educd, 65, 71)   // level 4 -- "certificate program", count 1 year of college
replace isced97 = 8  if inrange(educd, 80, 90)   // level 5B -- associate's degree or 3 years of college
replace isced97 = 9  if inrange(educd, 100, 115) // level 5A -- college and advanced degrees
replace isced97 = 10 if educd == 116             // level 6 -- PhD

*"at least this level" indicators for Table B4
gen isced97_5a = (isced97 != .) & (isced97 >= 9)
gen isced97_5b = (isced97 != .) & (isced97 >= 8)
gen isced97_4 = (isced97 != .) & (isced97 >= 7)
gen isced97_3 = (isced97 != .) & (isced97 >= 6)

*four-category version for Fig 3
gen isced_int = 1 if isced97 >= 0 & isced97 <= 4      // lower secondary or less
replace isced_int = 2 if isced97 >= 5 & isced97 <= 6  // upper secondary
replace isced_int = 3 if isced97 == 7                 // post-secondary
replace isced_int = 4 if isced97 >= 8 & isced97 <= 10 // tertiary

*russia version: three categories
gen isced_rus = 1 if isced97 >= 0 & isced97 <= 4      // lower secondary or less
replace isced_rus = 2 if isced97 >= 5 & isced97 <= 6  // upper secondary
replace isced_rus = 3 if isced97 >= 7 & isced97 <= 10 // tertiary incl. post-secondary (not separately categorized in russia census)
	
* Years of schooling implied by each detailed education code.
* Codes that appear in no rule below keep the base value 0.
gen edu_years = 0

* single codes, written as educd:years
* (code 115 is set to 18 but could be 17, 19, or 20 years)
local yrs_by_code 10:2 13:2.5 14:1 15:2 16:3 17:4 20:7 21:5.5 22:5 23:6 24:7.5 25:7 26:8
local yrs_by_code `yrs_by_code' 30:9 40:10 50:11 60:12 61:11 90:15
local yrs_by_code `yrs_by_code' 110:17 111:18 112:19 113:20 114:18 115:18 116:21
foreach rule of local yrs_by_code {
	gettoken code rest : rule, parse(":")
	gettoken sep yrs : rest, parse(":")
	replace edu_years = `yrs' if educd == `code'
}

* ranges of codes that share one value (none overlaps a single code above)
replace edu_years = 12 if inrange(educd, 62, 65)
replace edu_years = 13 if inrange(educd, 70, 71)
replace edu_years = 14 if inrange(educd, 80, 83)
replace edu_years = 16 if inrange(educd, 100, 101)

drop educd
	
*** speaks English well: indicator equal to 1 when speakeng is 3, 4 or 5
generate Eng_well = (speakeng == 3 | speakeng == 4 | speakeng == 5)
drop speakeng

** Sample selection
* Each flag starts from an eligibility rule and is then zeroed for everyone failing
* any exclusion; the exclusions are combined into one condition per group.

* Main sample: ages 25-65; natives plus Imm-group migrants who arrived at age 25+
* between 1962 (first post-berlin wall year) and 2003 (after which many CB countries join EU).
* A small number from 1960-1961 enter through the 1960 interval code in 1980/1990.
gen sample = inrange(age_alt, 25, 65)
replace sample = 0 if native == 0 & Imm == 0
replace sample = 0 if Imm == 1 & ( ///
	immigration_age <= 24 | ///
	immigration_year_approx < 1962 | ///
	immigration_year_approx > 2003 | ///
	immigration_year == . | immigration_age == . )

* Alternative sample using the Imm2 group and arrival from 1973 (first major expansion of EU).
* A small number from 1974 are lost through the 1970 interval code in 1980/1990.
gen sample2 = inrange(age_alt, 25, 65)
replace sample2 = 0 if native == 0 & Imm2 == 0
replace sample2 = 0 if Imm2 == 1 & ( ///
	immigration_age <= 24 | ///
	immigration_year_approx < 1973 | ///
	immigration_year_approx > 2003 | ///
	immigration_year == . | immigration_age == . )

* Same as the main sample but without the age and age-at-arrival restrictions.
gen sample_allages = 1
replace sample_allages = 0 if native == 0 & Imm == 0
replace sample_allages = 0 if Imm == 1 & ( ///
	immigration_year_approx < 1962 | ///
	immigration_year_approx > 2003 | ///
	immigration_year == . | immigration_age == . )

* Copy for trajectories and attrition tests, taken BEFORE later survey years are blanked out.
gen sample_traj = sample

* exclude surveys after 2011 to match German census
foreach flag in sample sample2 sample_allages {
	replace `flag' = . if year > 2011
}

*Year-of-arrival groups for immigrants (always 0 when Imm is 0)
gen post_87 = Imm == 1 & immigration_year_approx != . & immigration_year_approx >= 1987
gen post_93 = Imm == 1 & immigration_year_approx != . & immigration_year_approx >= 1993
gen pre_86 = Imm == 1 & immigration_year_approx <= 1986

*closed arrival windows, so cohort composition is fixed across survey years
gen btw_87_90 = Imm == 1 & inrange(immigration_year_approx, 1987, 1990)
gen btw_75_80 = Imm == 1 & inrange(immigration_year_approx, 1975, 1980)
gen btw_95_00 = Imm == 1 & inrange(immigration_year_approx, 1995, 2000)

*left missing in the 1990 survey, which groups 1987-1990 arrivals together
gen post_89 = Imm == 1 & immigration_year_approx != . & immigration_year_approx >= 1989 if year != 1990

*immigrants in the Imm group who are not in the EB group
gen West = (Imm == 1 & EB == 0)

*** Interaction Terms
* All survey-year conditions below are wrapped in parentheses (or inrange) so that the
* product is "group flag times year indicator" by construction, rather than relying on
* operator precedence and on the flag being coded 0/1.

* origin group x fixed arrival-window cohort
foreach stub in EB Imm West {
	foreach cohort in btw_87_90 btw_75_80 btw_95_00 {
		gen `stub'_`cohort' = `cohort' * `stub'
	}
}

foreach stub in EB Imm {

	* origin group x open-ended arrival cohort
	gen `stub'_post_87 = post_87 * `stub'
	gen `stub'_post_93 = post_93 * `stub'
	gen `stub'_post_89 = post_89 * `stub'

	*Individual arrival-year coefficients, only available for survey years 2000 and onward.
	*The loop creates one variable per arrival year 1986 through 1998.
	*NOTE(review): the original comment said "to 1999"; confirm whether 1999 is intentionally left out.
	forvalues j = 86/98 {
		gen `stub'_`j' = `stub' * (immigration_year == 19`j') if year >= 2000
	}

	*interactions with survey years for trajectories and attrition tests
	gen `stub'_btw_87_90_y90 = `stub'_btw_87_90 * (year == 1990)
	gen `stub'_btw_87_90_y00 = `stub'_btw_87_90 * (year == 2000)
	gen `stub'_btw_87_90_y10 = `stub'_btw_87_90 * inrange(year, 2008, 2012)

	gen `stub'_btw_75_80_y80 = `stub'_btw_75_80 * (year == 1980)
	gen `stub'_btw_75_80_y90 = `stub'_btw_75_80 * (year == 1990)
	gen `stub'_btw_75_80_y00 = `stub'_btw_75_80 * (year == 2000)

	gen `stub'_btw_95_00_y00 = `stub'_btw_95_00 * (year == 2000)
	gen `stub'_btw_95_00_y10 = `stub'_btw_95_00 * inrange(year, 2008, 2012)
	gen `stub'_btw_95_00_y17 = `stub'_btw_95_00 * inrange(year, 2015, 2019)
}

* alternative origin groupings x open-ended arrival cohort
foreach stub in Imm2 Sov Sat {
	gen `stub'_post_87 = post_87 * `stub'
	gen `stub'_post_93 = post_93 * `stub'
}

*collapse birth place states down to country: every bpl code below 100 becomes 99
* NOTE(review): assumes a value label named BPL exists in the extract; code 99 may have no label text.
generate birth_country = cond(bpl < 100, 99, bpl)
label values birth_country BPL

*bins for control vars (lower edge of each 5-yr bin)
generate age_bin = 5 * floor(age_alt / 5)
generate immigration_age_bin = 5 * floor(immigration_age / 5)

* years in US is rounded to whole years first: it is used as FE, not as a linear control
replace years_in_us = round(years_in_us)
* 1-yr bins for the first 5 years, 5-yr bins afterwards
generate years_in_us_bin = cond(years_in_us <= 5, years_in_us, 5 * floor(years_in_us / 5))


* Shrink storage types where possible, then write the prepared dataset, overwriting
* any existing file. Forward slashes are used as directory separators because Stata
* accepts them on every operating system, whereas backslashes only resolve on Windows.
compress
save "$path/Tempfiles/USCensus_vars.dta", replace
